import numpy as np
import pandas as pd
import re
import datetime
from plotly.offline import init_notebook_mode, iplot, plot
import plotly as py
import plotly.express as px
init_notebook_mode(connected=True)
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import os
# Load the restaurant-visitor datasets. AIR and HPG are two different
# reservation systems; store_ids links stores that appear in both.
air_visits = pd.read_csv('./Data/air_visit_data.csv')    # daily visitor counts per AIR store
air_reserve = pd.read_csv('./Data/air_reserve.csv')      # AIR reservations (reserve/visit datetimes)
hpg_reserve = pd.read_csv('./Data/hpg_reserve.csv')      # HPG reservations (same shape as AIR)
air_store = pd.read_csv('./Data/air_store_info.csv')     # AIR store genre, area, latitude/longitude
hpg_store = pd.read_csv('./Data/hpg_store_info.csv')     # HPG store genre, area, latitude/longitude
holidays = pd.read_csv('./Data/date_info.csv')           # calendar dates with a holiday flag
store_ids = pd.read_csv('./Data/store_id_relation.csv')  # presumably AIR <-> HPG id mapping — verify against columns
test_data = pd.read_csv('./Data/sample_submission.csv')  # submission ids (encode store id + date)
# insert whichever dataframe you want to view here:
air_visits.describe()
| visitors | |
|---|---|
| count | 252108.000000 |
| mean | 20.973761 |
| std | 16.757007 |
| min | 1.000000 |
| 25% | 9.000000 |
| 50% | 17.000000 |
| 75% | 29.000000 |
| max | 877.000000 |
# quick look at number of rows and cols
# (one row per store per visit date)
air_visits.shape
(252108, 3)
# can apply this to all files
# per-column count of missing values, as a quick completeness check
missing_values = air_visits.isna().sum()
missing_values
air_store_id 0 visit_date 0 visitors 0 dtype: int64
# we will take a look at the number of visits to the AIR restaurants
# first plot will group visits by visit date and sum the # of visitors
# NOTE: aggregate only 'visitors' — a bare .sum() also tries to "sum" the
# air_store_id strings (slow concatenation, and a TypeError in pandas >= 2.0).
p1 = air_visits.groupby(['visit_date'], as_index=False).agg({'visitors': 'sum'})
fig = px.line(p1, x='visit_date', y='visitors', title='Total Visitors by Visit Date')
fig.show()
# Histogram of daily visitor counts on a log-scaled x axis (10^0..10^2),
# while the tick labels show the untransformed "true" visitor values.
fig=px.histogram(air_visits, x='visitors', title='Occurences of Visitor count')
fig.update_xaxes(type='log', range=[0,2])
# ticks: 15 geometrically spaced values from 1 to the max count, rounded to
# one significant figure so the labels stay readable
fig.update_layout(bargap=0.05, xaxis={'tickmode':'array',
'tickvals': pd.to_numeric([f"{n:.1g}" for n in np.geomspace(1, air_visits['visitors'].max(), 15)]),})
fig.show()
# plot the max visitors by air_id and the count of occurences
# .max() per store keeps each store's single busiest day; the histogram then
# shows how those per-store peaks are distributed
p1 = air_visits.groupby(['air_store_id'], as_index=False).max()
fig=px.histogram(p1, x='visitors', title='Occurences of Max Visitor count by store id')
fig.update_layout(bargap=0.1)
fig.show()
# Creating a temp dataframe copy from air_visits in order to generate a day of the week column.
# Day of week: Monday = 0 Sunday = 6
df_tmp = air_visits.copy()
df_tmp['date'] = pd.to_datetime(df_tmp['visit_date'])
df_tmp['wday'] = df_tmp['date'].dt.dayofweek.astype(str)
# Aggregate only 'visitors': a bare .median() would also hit the non-numeric
# air_store_id/visit_date columns (TypeError in pandas >= 2.0).
p1 = df_tmp.groupby(['wday'], as_index=False).agg({'visitors': 'median'})
fig = px.bar(p1, x='wday', y='visitors', color = 'wday')
fig.update_layout(coloraxis={'showscale': False}, showlegend=False, xaxis={'tickmode': 'array', 'tickvals': [0,1,2,3,4,5,6], 'ticktext': ['Mon', 'Tue', 'Wed', 'Thur', 'Fri', 'Sat', 'Sun']},
xaxis_title='Day of the Week', yaxis_title= 'Median Visitors')
fig.show()
# Extracting the month from the temp dataframe used above and creating a new
# column called 'month' in order to graph the median visitors by month.
df_tmp['month'] = df_tmp['date'].dt.month.astype(str)
# Aggregate only 'visitors' (see weekday plot above: bare .median() breaks on
# the object-dtype columns in pandas >= 2.0).
p2 = df_tmp.groupby(['month'], as_index=False).agg({'visitors': 'median'})
fig = px.bar(p2, x='month', y='visitors', color = 'month')
fig.update_layout(coloraxis={'showscale': False}, showlegend=False, xaxis={'tickmode': 'array', 'tickvals': [1,2,3,4,5,6,7,8,9,10,11,12],
'ticktext': ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']},
xaxis_title='Month', yaxis_title= 'Median Visitors')
fig.show()
# Median visitors per year-month bucket, colored by the month component.
df_tmp['year'] = df_tmp['date'].dt.year.astype(str)
df_tmp['year_month'] = df_tmp['year'] + '-' + df_tmp['month']
p3 = df_tmp.groupby('year_month', as_index=False).agg({'visitors': 'median'})
# recover the month part of the 'YYYY-M' label for the color grouping
p3['month'] = p3['year_month'].str.split('-').str[1]
fig = px.bar(p3, x='year_month', y='visitors', color='month')
fig.update_layout(showlegend=False, coloraxis={'showscale': False}, xaxis_title='Year-Month', yaxis_title='Median Visitors')
fig.show()
# Zoom into mid-April through mid-June 2016 to inspect the Golden Week dip.
df_apr_may_2016 = df_tmp[ (df_tmp['date'] > '2016-04-15') & (df_tmp['date'] < '2016-06-15') ]
p1 = df_apr_may_2016.groupby(['date'], as_index=False).agg({'visitors':'sum'})
# raw daily totals as a line, plus a 3-day rolling-mean trendline overlay
fig1 = px.line(p1, x='date', y='visitors')
fig2 = px.scatter(p1, x='date', y='visitors', trendline='rolling', trendline_options={'window': 3}, color_discrete_sequence=px.colors.qualitative.Pastel1)
# combine both figures' traces into a single plot
fig3 = go.Figure(data=fig1.data + fig2.data)
fig3.update_layout(xaxis_title='Date', yaxis_title= 'All Visitors')
fig3.show()
# earlier spline/line overlay attempt, kept for reference:
# fig=go.Figure()
# fig.add_trace(go.Scatter(x=p1['date'], y=p1['visitors'], line_shape='spline'))
# fig.add_trace(go.Scatter(x=p1['date'], y=p1['visitors'], mode='lines'))
# fig.show()
Looking at this plot, we can see the blue line, which corresponds to the raw data (total visitors by day), and an approximated pink line with a smoothing fit. There is a drop relative to the overall pattern from May 1st to just before May 8th, which corresponds to the Golden Week holiday.
We can do a similar datetime visualization with the reservation dataset, as it includes a field showing the visitor count for each reservation. We can also look at the time difference between when the reservation was made and when it was made for.
# working with the air_reservation data now, will work to extract month, hour, and weekday values for each of the reservation fields and the visit fields
foo = air_reserve.copy()
# converting and extracting the reservation data
foo['reserve_date'] = pd.to_datetime(foo['reserve_datetime'])
foo['reserve_hour'] = foo['reserve_date'].dt.hour
foo['reserve_wday'] = foo['reserve_date'].dt.dayofweek
foo['reserve_date'] = foo['reserve_date'].dt.date
# converting and extracting the visit data
foo['visit_date'] = pd.to_datetime(foo['visit_datetime'])
foo['visit_hour'] = foo['visit_date'].dt.hour
foo['visit_wday'] = foo['visit_date'].dt.dayofweek
foo['visit_date'] = foo['visit_date'].dt.date
# lead time from reservation to visit (visit time - reservation time):
# parse the two timestamp columns once and derive both the HOURS and the
# whole-DAYS representations from the same timedelta (the original re-parsed
# both columns twice)
delta = pd.to_datetime(foo['visit_datetime']) - pd.to_datetime(foo['reserve_datetime'])
foo['diff_hour'] = delta / np.timedelta64(1, 'h')
foo['diff_day'] = delta.dt.days
# Total reserved visitors per visit date (AIR)
p1 = foo.groupby(['visit_date'], as_index=False).agg({'reserve_visitors': 'sum'})
fig = px.line(p1, x='visit_date', y='reserve_visitors')
fig.update_layout(xaxis_title='AIR Visit Date', yaxis_title='Total Reservations')
fig.show()
# Total reserved visitors by the hour of day the visit is booked for
p2 = foo.groupby(['visit_hour'], as_index=False).agg({'reserve_visitors': 'sum'})
fig2 = px.bar(p2, x='visit_hour', y='reserve_visitors')
fig2.update_layout(xaxis_title='AIR Visit Hour', yaxis_title='Total Reservations')
fig2.show()
# only showing time gaps shorter than 5 days out from the time of reservation as some values go out to over a year
p3 = foo.groupby(['diff_hour'], as_index=False).agg({'reserve_visitors': 'sum'})
# 5 days = 24*5 = 120 hours (was 25*5, which let part of a sixth day slip in)
p3 = p3[ p3['diff_hour'] < 24*5 ]
fig3 = px.bar(p3, x='diff_hour', y='reserve_visitors')
fig3.update_xaxes(title='AIR: Time from Reservation to Visit (hrs)')
fig3.update_yaxes(title='Total Reservations')
fig3.show()
# Five longest reservation-to-visit gaps (in days), selected by column name
# rather than by position
tmp = foo.sort_values(['diff_day'], ascending=False)
tmp[['air_store_id', 'visit_datetime', 'reserve_datetime', 'diff_day']].head()
| air_store_id | visit_datetime | reserve_datetime | diff_day | |
|---|---|---|---|---|
| 59263 | air_e7fbee4e3cfe65c5 | 2017-02-07 20:00:00 | 2016-01-11 17:00:00 | 393 |
| 59257 | air_e7fbee4e3cfe65c5 | 2017-02-07 20:00:00 | 2016-01-12 20:00:00 | 392 |
| 59262 | air_e7fbee4e3cfe65c5 | 2017-02-07 20:00:00 | 2016-01-18 18:00:00 | 386 |
| 50826 | air_2a485b92210c98b5 | 2017-01-18 20:00:00 | 2016-01-02 00:00:00 | 382 |
| 50805 | air_2a485b92210c98b5 | 2017-01-18 20:00:00 | 2016-01-02 20:00:00 | 382 |
These are the top 5 longest differences (by day) between reservation and visit dates. They contain only 2 unique air store ids — these must be very in-demand locations where this type of reservation is standard, or they could be input errors with incorrect years.
We will now shift gears toward the HPG data, a similar dataset to the AIR info but from a different reservation app. We will produce the same charts as above.
# working with the hpg_reservation data now, will work to extract month, hour, and weekday values for each of the reservation fields and the visit fields
foo = hpg_reserve.copy()
# converting and extracting the reservation data
foo['reserve_date'] = pd.to_datetime(foo['reserve_datetime'])
foo['reserve_hour'] = foo['reserve_date'].dt.hour
foo['reserve_wday'] = foo['reserve_date'].dt.dayofweek
foo['reserve_date'] = foo['reserve_date'].dt.date
# converting and extracting the visit data
foo['visit_date'] = pd.to_datetime(foo['visit_datetime'])
foo['visit_hour'] = foo['visit_date'].dt.hour
foo['visit_wday'] = foo['visit_date'].dt.dayofweek
foo['visit_date'] = foo['visit_date'].dt.date
# lead time from reservation to visit (visit time - reservation time):
# parse the timestamps once and derive both the HOURS and whole-DAYS
# representations from the same timedelta (avoids four redundant parses)
delta = pd.to_datetime(foo['visit_datetime']) - pd.to_datetime(foo['reserve_datetime'])
foo['diff_hour'] = delta / np.timedelta64(1, 'h')
foo['diff_day'] = delta.dt.days
# Total reserved visitors per visit date (HPG)
p1 = foo.groupby(['visit_date'], as_index=False).agg({'reserve_visitors': 'sum'})
fig = px.line(p1, x='visit_date', y='reserve_visitors')
fig.update_layout(xaxis_title='HPG Visit Date', yaxis_title='Total Reservations')
fig.show()
# Total reserved visitors by the hour of day the visit is booked for
p2 = foo.groupby(['visit_hour'], as_index=False).agg({'reserve_visitors': 'sum'})
fig2 = px.bar(p2, x='visit_hour', y='reserve_visitors')
fig2.update_layout(xaxis_title='HPG Visit Hour', yaxis_title='Total Reservations')
fig2.show()
# only showing time gaps shorter than 5 days out from the time of reservation as some values go out to over a year
p3 = foo.groupby(['diff_hour'], as_index=False).agg({'reserve_visitors': 'sum'})
# 5 days = 24*5 = 120 hours (was 25*5, which let part of a sixth day slip in)
p3 = p3[ p3['diff_hour'] < 24*5 ]
fig3 = px.bar(p3, x='diff_hour', y='reserve_visitors')
fig3.update_xaxes(title='HPG: Time from Reservation to Visit (hrs)')
fig3.update_yaxes(title='Total Reservations')
fig3.show()
# using air_store we can take a look at the lats and longs provided of the stores
# (density heatmap of AIR store locations on a carto basemap)
fig = px.density_mapbox(air_store, lat='latitude', lon='longitude', mapbox_style='carto-positron')
# NOTE(review): update_layout() with no arguments is a no-op — likely leftover
fig.update_layout()
fig.show()
To get a more numerical view, let's plot the count of restaurants by area and the count of categories. We'll stick to the top 15 if the lists become too cumbersome to view.
foo = air_store.copy()
# genre counts, most common genre first
p1 = (foo.groupby(['air_genre_name'], as_index=False)
         .size()
         .rename(columns={'size': 'count'})
         .sort_values('count', ascending=False))
fig = px.bar(p1, x='count', y='air_genre_name', orientation='h', color='air_genre_name')
fig.update_layout(showlegend=False, xaxis_title='Count of Genre Type (AIR)', yaxis_title='Genre Name')
fig.show()
# area counts, limited to the 15 areas with the most restaurants
p2 = (foo.groupby(['air_area_name'], as_index=False)
         .size()
         .rename(columns={'size': 'count'})
         .sort_values('count', ascending=False)
         .head(15))
fig2 = px.bar(p2, x='count', y='air_area_name', orientation='h', color='air_area_name')
fig2.update_layout(showlegend=False, xaxis_title='Num of Restaurants (AIR)', yaxis_title='Area Name (Top 15)')
fig2.show()
# using hpg_store we can take a look at the lats and longs provided of the stores
# (density heatmap of HPG store locations on a carto basemap)
fig = px.density_mapbox(hpg_store, lat='latitude', lon='longitude', mapbox_style='carto-positron')
# NOTE(review): update_layout() with no arguments is a no-op — likely leftover
fig.update_layout()
fig.show()
foo = hpg_store.copy()
# genre counts, limited to the 15 most common genres
p1 = (foo.groupby(['hpg_genre_name'], as_index=False)
         .size()
         .rename(columns={'size': 'count'})
         .sort_values('count', ascending=False)
         .head(15))
fig = px.bar(p1, x='count', y='hpg_genre_name', orientation='h', color='hpg_genre_name')
fig.update_layout(showlegend=False, xaxis_title='Count of Genre Type (HPG)', yaxis_title='Genre Name (Top 15)')
fig.show()
# area counts, limited to the 15 areas with the most restaurants
p2 = (foo.groupby(['hpg_area_name'], as_index=False)
         .size()
         .rename(columns={'size': 'count'})
         .sort_values('count', ascending=False)
         .head(15))
fig2 = px.bar(p2, x='count', y='hpg_area_name', orientation='h', color='hpg_area_name')
fig2.update_layout(showlegend=False, xaxis_title='Num of Restaurants (HPG)', yaxis_title='Area Name (Top 15)')
fig2.show()
# air restaurants
foo = air_store.copy()
# Keep the leading token of the area name (the prefecture): split on the first
# hyphen or whitespace. regex=True makes the regex intent explicit — pandas
# >= 1.4 warns when a multi-char pattern is passed without it.
foo['trunc_area'] = foo['air_area_name'].str.split(r'[-\s]', expand=True, regex=True)[0]
p1 = foo.groupby(['trunc_area'], as_index=False).size()
p1.rename(columns={'size':'count'}, inplace=True)
p1.sort_values(['count'], ascending=False, inplace=True)
fig = px.bar(p1, x='count', y='trunc_area', orientation='h', color='trunc_area')
fig.update_xaxes(title='Num of Restaurants (AIR)')
fig.update_yaxes(title='Prefecture Name')
fig.update_layout(showlegend=False)
fig.show()
# hpg restaurants
foo = hpg_store.copy()
# Keep the leading token of the area name (the prefecture); regex=True makes
# the regex intent explicit (pandas >= 1.4 warns on implicit multi-char regex).
foo['trunc_area'] = foo['hpg_area_name'].str.split(r'[-\s]', expand=True, regex=True)[0]
p1 = foo.groupby(['trunc_area'], as_index=False).size()
p1.rename(columns={'size':'count'}, inplace=True)
p1.sort_values(['count'], ascending=False, inplace=True)
fig = px.bar(p1, x='count', y='trunc_area', orientation='h', color='trunc_area')
fig.update_xaxes(title='Num of Restaurants (HPG)')
fig.update_yaxes(title='Prefecture Name')
fig.update_layout(showlegend=False)
fig.show()
Taking a look at the adjusted plots taking into consideration just the top level prefecture, Tokyo is by far the most accounted for area with Osaka and Fukuoka filling out the top 3 for number of restaurants.
We now shift focus to taking a top level look at the holidays in Japan from the provided holidays dataset. This will provide some context to the total number of holidays and their potential impact on reservations/visitors. We can also look at where these holidays land along our date range for the provided data.
# looking at the date_info file imported as holidays
# count of holiday vs non-holiday calendar dates
p1 = holidays.groupby('holiday_flg', as_index=False).size()
fig = px.bar(p1, x='holiday_flg', y='size', color='holiday_flg')
fig.update_xaxes(title='Holiday Flag')
fig.update_yaxes(title='Count')
fig.update_layout(showlegend=False, coloraxis={'showscale': False})
# BUG FIX: this figure was built but never displayed before `fig` was
# reassigned to the subplot figure below
fig.show()
# prepare date split for 2016 and 2017 data (same late-April/May window)
p2 = holidays.copy()
p2 = p2.set_index(['calendar_date'])
p2 = p2.loc['2016-04-15' : '2016-06-01']
p2.reset_index(inplace=True)
p3 = holidays.copy()
p3 = p3.set_index(['calendar_date'])
p3 = p3.loc['2017-04-15' : '2017-06-01']
p3.reset_index(inplace=True)
# make dual plot with 2016 (top) and 2017 (bottom); marker color encodes the flag
fig = make_subplots(rows=2, cols=1, vertical_spacing=0.1)
fig.add_trace(go.Scatter(x=p2['calendar_date'], y=p2['holiday_flg'], mode='markers', marker_color=p2['holiday_flg']), row=1, col=1)
fig.add_trace(go.Scatter(x=p3['calendar_date'], y=p3['holiday_flg'], mode='markers', marker_color=p3['holiday_flg']), row=2, col=1)
fig.update_layout(height=600, width=800, title_text="Holidays in dataset", showlegend=False, title_x=0.5)
fig.show()
# fraction of calendar dates flagged as holidays (dict agg keeps the label in the output)
holidays['holiday_flg'].agg({'holiday_flg': 'mean'})
holiday_flg 0.067698 Name: holiday_flg, dtype: float64
We find that the same days were holidays in 2016 and 2017 in late April and May; additionally, almost 7% of the listed dates are holidays.
# create unique date set for the training data
foo = air_visits.copy()
foo.rename(columns={'visit_date':'date'}, inplace=True)
foo.drop_duplicates(subset=['date'], inplace=True)
foo['dset'] = 'train'
# create unique date set for the testing data
bar = test_data.copy()
# submission id has the shape '<sys>_<store_id>_<YYYY-MM-DD>' -> 3 columns
# (n passed by keyword: pandas deprecates positional args to str.split)
bar[['foo', 'bar', 'date']] = bar['id'].str.split('_', n=-1, expand=True)
# BUG FIX: drop_duplicates result was previously discarded (a silent no-op);
# apply it in place to mirror the training-set dedup above
bar.drop_duplicates(subset=['date'], inplace=True)
bar['dset'] = 'test'
baz = pd.concat([foo, bar], axis=0, ignore_index=True)
# capture the real year BEFORE remapping, so rows stay distinguishable
baz['year'] = pd.DatetimeIndex(baz['date']).year
baz['year'] = baz['year'].astype(str)
# project every date onto a single 2017 axis so the 2016 and 2017 ranges overlay
baz['date'] = baz['date'].str.replace('2016', '2017', regex=False)
baz = baz[baz['date'].notna()]
fig = px.scatter(baz, x='date', y='year', color='dset')
fig.update_traces(marker={'size':10 })
# month-start ticks, labelled by month abbreviation
fig.layout.xaxis.tickvals = pd.date_range('2017-01', '2018-01', freq='MS')
fig.layout.xaxis.tickformat = '%b'
fig.show()
We can see that the test dataset, in red, comes in about the last week of April and encompasses up to June of 2017. Our training set has a full year+ of data to work with.
We can start to combine features and analyze their relationships, which in turn need to be interpreted in the context of the individual features' distributions, similar to what we did above for some features.
We'll start with average AIR restaurant visitors in relation to the type of cuisine (genre)
# join our visit information and genre information into a new df
foo = air_visits.merge(air_store, on = 'air_store_id', how='left')
# mean visitors per (date, genre); one subplot per genre below
foo = foo.groupby(['visit_date', 'air_genre_name'], as_index=False).agg({'visitors': 'mean'})
genres = list(foo.groupby('air_genre_name'))
subplot_titles = [genre[0] for genre in genres]
rows = 4
cols = 4
fig = make_subplots(rows=rows, cols=cols, vertical_spacing=0.15, subplot_titles=subplot_titles, shared_yaxes=True)
for i, l in enumerate(genres):
    row = i // cols + 1
    # BUG FIX: the column index must wrap on `cols`, not `rows` — the original
    # only worked because the grid happens to be square (4x4)
    col = (i % cols) + 1
    df = l[1]
    fig.add_trace(go.Scatter(x=df['visit_date'], y=df['visitors'], showlegend=False), row=row, col=col)
fig.update_layout(height=800, width=1300, title_text="Average number of visitors by genre (AIR)")
# clamp every subplot's y axis to a shared 0-100 range
for ax in fig['layout']:
    if ax[:5] == 'yaxis':
        fig['layout'][ax]['range'] = [0,100]
fig.show()
We can see from these plots that there is some semblance of a weekly pattern, which we saw in earlier plots of visitor data. With this we can take a look at the mean values by day of week and genre.
# Mean visitors per genre, one colored series per day of week.
foo = air_visits.merge(air_store, on='air_store_id', how='left')
foo['wday'] = pd.to_datetime(foo['visit_date']).dt.dayofweek
foo['wday'] = foo['wday'].astype(str)
p1 = foo.groupby(['wday', 'air_genre_name'], as_index=False).agg({'visitors': 'mean'})
fig = px.scatter(p1, x='visitors', y='air_genre_name', color='wday')
fig.update_traces(marker={'size': 15})
# relabel the numeric weekday trace names ('0'..'6') with readable day names
day_names = {"0": "Monday", "1": "Tuesday", "2": "Wednesday", "3": "Thursday",
             "4": "Friday", "5": "Saturday", "6": "Sunday"}
fig.for_each_trace(lambda t: t.update(name=day_names.get(t.name, t.name)))
fig.update_layout(height=1000, width=800)
fig.show()
# Per-genre distribution of daily visitor counts, shown as box plots on a
# log-scaled x axis.
foo = air_visits.merge(air_store, on='air_store_id', how='left')
genres = list(foo.groupby('air_genre_name'))
subplot_titles = [genre[0] for genre in genres]
fig = go.Figure()
for genre_name, genre_df in genres:
    fig.add_trace(go.Box(x=genre_df['visitors'], showlegend=False, name=genre_name))
fig.update_xaxes(title='Visitor Count (log)', type='log')
fig.update_layout(height=600, width=1000, title='Visitor Distribution by Genre (AIR)')
fig.show()
# Rebind the working variables to a small scalar so the large intermediate
# DataFrames they held can be garbage-collected before the next section.
foo = 1
bar = 1
baz = 1
p1 = 1
p2 = 1
p3 = 1
df_tmp = 1
We can take a deeper look in relation to the holiday data we previously saw, let's see the impact of holidays on average visitor data overall rather than by category for now:
# Align visit dates with the holiday calendar and compare visitor distributions.
foo = air_visits.rename(columns={'visit_date':'calendar_date'})
bar = foo.merge(holidays, on = 'calendar_date', how='left')
# 0/1 flag -> bool so the box plot gets two clean categories
bar['holiday_flg'] = bar['holiday_flg'].astype(bool)
fig = px.box(bar, x='holiday_flg', y='visitors', color='holiday_flg')
fig.update_yaxes(type='log')
fig.show()
# Mean visitors by weekday, split by holiday flag.
bar['wday'] = pd.to_datetime(bar['calendar_date']).dt.dayofweek
p1 = bar.groupby(['wday', 'holiday_flg'], as_index=False).agg({'visitors': 'mean'})
wdays = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']
fig = px.scatter(p1, x='wday', y='visitors', color='holiday_flg')
fig.update_traces(marker={'size': 15})
fig.update_layout(height=600, width=600)
# readable day-name tick labels in place of the 0-6 codes
fig.layout.xaxis.tickvals = list(range(len(wdays)))
fig.layout.xaxis.ticktext = wdays
fig.show()
Based on the first plot, I don't see much of a shift in average visitors holiday vs. not holiday. To see this with a bit more clarity, the second plot shows the breakdown by day of week and we see that holidays that land on weekdays (Mon-Fri) have a higher average visitor count with the weekend showing a negative impact on Saturday and only a slight positive impact on Sunday.